
* +++++++++++++++++++++
* CLEAN INTEGRATION
* COURSES DATA
* +++++++++++++++++++++

/*
Integrationskursgeschäftsstatistik: 2015 - 2019
Source: https://www.bamf.de/DE/Themen/Statistik/Integrationskurszahlen/integrationskurszahlen-node.html
*/

global data_raw_int "${data_raw}/integration_courses/"

* +++++++++++++++++++++++++
* 2016 data
* +++++++++++++++++++++++++

* 2016 data looks a little different so we have to do this separately

* number of graduates (absolventen)
import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-absolventen-csv.csv", clear delimiter(";")

export delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-absolventen-csv_2.csv", replace

import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-absolventen-csv_2.csv", clear delimiter(";") rowrange(3)

erase "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-absolventen-csv_2.csv"

split v1, parse(";")
replace v1 = v11 if v11 != v1
forval i=2/3{
	replace v`i' = v1`i' if v`i' == ""
}
forval i=4/5{
	destring v1`i', replace
	replace v`i' = v1`i' if v`i' == .
}
* focus on key variables and give them meaningful names
drop v2 v11-v15
ren (v1 v3 v4 v5) /// 
	(nuts1_name nuts3_name nuts3 num_graduates_2016)
recast str2045 nuts1_name	
collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(nuts3)
	
tempfile absolventen_2016
save `absolventen_2016'
	
* number of courses	
import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-integrationskurse-csv.csv", clear delimiter(";")	

export delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-integrationskurse-csv_2.csv", replace

import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-integrationskurse-csv_2.csv", clear delimiter(";") rowrange(3)

erase "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-integrationskurse-csv_2.csv"

split v1, parse(";")
replace v1 = v11 if v11 != v1
forval i=2/3{
	replace v`i' = v1`i' if v`i' == ""
}
forval i=4/6{
	destring v1`i', replace
	replace v`i' = v1`i' if v`i' == .
}
* focus on key variables and give them meaningful names
drop v2 v11-v16
ren (v1 v3 v4 v5 v6) ///  
	(nuts1_name nuts3_name nuts3 num_courses_start_2016 num_courses_compl_2016)
recast str2045 nuts1_name	
collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(nuts3)

tempfile courses_2016
save `courses_2016'		
	
* number of people eligible to take classes	
import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-teilnahmeberechtigungen-csv.csv", clear	

export delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-teilnahmeberechtigungen-csv_2.csv", replace

import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-teilnahmeberechtigungen-csv_2.csv", clear delimiter(";") rowrange(3)

erase "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-teilnahmeberechtigungen-csv_2.csv"

split v1, parse(";")
replace v1 = v11 if v11 != v1
forval i=2/3{
	replace v`i' = v1`i' if v`i' == ""
}
forval i=4/5{
	destring v1`i', replace
	replace v`i' = v1`i' if v`i' == .
}
* focus on key variables and give them meaningful names
drop v2 v11-v15
ren (v1 v3 v4 v5) /// 
	(nuts1_name nuts3_name nuts3 num_new_eligible_2016)
recast str2045 nuts1_name	
collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(nuts3)

tempfile eligible_2016
save `eligible_2016'		
	
* number of participants
import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-Teilnehmer-csv.csv", clear	

export delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-Teilnehmer-csv_2.csv", replace

import delimited "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-Teilnehmer-csv_2.csv", clear delimiter(";") rowrange(3)

erase "${data_raw_int}/2016/2016-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-Teilnehmer-csv_2.csv"

split v1, parse(";")
replace v1 = v11 if v11 != v1
forval i=2/3{
	replace v`i' = v1`i' if v`i' == ""
}
forval i=4/5{
	destring v1`i', replace
	replace v`i' = v1`i' if v`i' == .
}
* focus on key variables and give them meaningful names
drop v2 v11-v15
ren (v1 v3 v4 v5) ///
	(nuts1_name nuts3_name nuts3 num_new_partic_2016)
recast str2045 nuts1_name	
collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(nuts3)	

tempfile partic_2016
save `partic_2016'		

* combine various files and clean up 
use `absolventen_2016', clear
foreach f in courses eligible partic {
	merge 1:1 nuts3 using ``f'_2016', keep(1 2 3) assert(1 2 3) nogen
}
replace nuts1_name = "Baden-Württemberg" if nuts1_name =="Baden-Wrttemberg"
replace nuts1_name = "Thüringen" if nuts1_name =="Thringen"
tempfile file_2016
save `file_2016'
	
* +++++++++++++++++++++++++
* 2015, 2017-2019 data
* +++++++++++++++++++++++++	
	
* these files have a very similar structure so we can do them in a loop	
	
foreach year in 2015 2017 2018 2019 {
	di "`year'"
	if `year' == 2015 {
		local old_vars "wohnortbundesland wohnortkreisdesc"
		local new_vars "nuts1_name nuts3_name"
		local file "absolventen"
		local var "graduates"
		local match_var "`new_vars'"
		local collapse "collapse (rawsum) num_*, by(`match_var')"
	}
	if `year' == 2017 | `year' ==2019 {
		local old_vars "wohnortbundesland wohnortkreisdesc wohnortkreisid"
		local new_vars "nuts1_name nuts3_name nuts3"
		local file "austritte"
		local var "left"
		local match_var "nuts3"
		local collapse "collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(`match_var')"
	}
	if `year' == 2018 {
		local old_vars "wohnortbundesland wohnortkreisdesc wohnortkreisid"
		local new_vars "nuts1_name nuts3_name nuts3"
		local file "austritte"
		local var "left"
		local match_var "nuts3"
	}
	
	* graduates / finishers (absolventen / austritte)
	import delimited "${data_raw_int}//`year'//`year'-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-`file'-csv.csv", clear

	keep `old_vars' anzahlderkurs`file'
	ren (`old_vars' anzahlderkurs`file') /// 
		(`new_vars' num_`var'_`year')
	recast str2045 nuts1_name
	if `year' == 2015 {
		collapse (rawsum) num_*, by(`match_var')
	}
	else {
		collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(`match_var')
	}
	tempfile `file'_`year'
	save ``file'_`year''
		
	* number of courses
	import delimited "${data_raw_int}//`year'//`year'-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-integrationskurse-csv.csv", clear
		
	keep *kursort*bundesland* *kursort*kreis* anzahlderbegonnenenkurse anzahlderbeendetenkurse
	ren *kursort*bundesland* nuts1_name
	ren *kursort*kreisdesc* nuts3_name
	if `year' != 2015 {
		ren kursortkursbeginnkreisid nuts3
	}
	ren (anzahlderbegonnenenkurse anzahlderbeendetenkurse) /// 
		(num_courses_start_`year' num_courses_compl_`year')	
	recast str2045 nuts1_name
	if `year' == 2015 {
		collapse (rawsum) num_*, by(`match_var')
	}
	else {
		collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(`match_var')
	}
	tempfile courses_`year'
	save `courses_`year''	
		
	* number of people being eligible
	import delimited "${data_raw_int}//`year'//`year'-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-teilnahmeberechtigungen-csv.csv", clear

	keep `old_vars' anzahlderneuenteilnahmeberechtig
	ren (`old_vars' anzahlderneuenteilnahmeberechtig) /// 
		(`new_vars' num_new_eligible_`year')
	recast str2045 nuts1_name
	if `year' == 2015 {
		collapse (rawsum) num_*, by(`match_var')
	}
	else {
		collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(`match_var')
	}
	tempfile eligible_`year'
	save `eligible_`year''	
		
	* number of participants	
	import delimited "${data_raw_int}//`year'//`year'-gesamt-anlage-integrationskursgeschaeftsstatistik-kreise-Teilnehmer-csv.csv", clear	

	keep `old_vars' anzahlderneuenkursteilnehmer
	ren (`old_vars' anzahlderneuenkursteilnehmer) /// 
		(`new_vars' num_new_partic_`year')
	recast str2045 nuts1_name
	if `year' == 2015 {
		collapse (rawsum) num_*, by(`match_var')
	}
	else {
		collapse (rawsum) num_* (first) nuts1_name nuts3_name, by(`match_var')
	}
	tempfile partic_`year'
	save `partic_`year''
		
	* combine files	by year
	use ``file'_`year'', clear
	foreach f in courses eligible partic {
		merge 1:1 `match_var' using ``f'_`year'', keep(1 2 3) assert(1 2 3) nogen
	}
	replace nuts1_name = "Baden-Württemberg" if nuts1_name =="Baden-Wrttemberg"
	replace nuts1_name = "Thüringen" if nuts1_name =="Thringen"
	if `year' == 2015 {
		gen fake_id = _n
	}
	foreach var of varlist num_* {
		tostring `var', gen(`var'_str) force
		replace `var' = `var' * 1000 if strpos(`var'_str, ".")!=0
		drop *_str
	}
	tempfile file_`year'
	save `file_`year''
}

* combine files for each different year 
use `file_2017', clear
foreach i in 2016 2018 2019 {
	merge 1:1 nuts3 using `file_`i'', nogen 
}
drop if nuts3 ==. | nuts3 == 0

* for 2015 we need an extra round to do this because the file only has place names and no IDs
preserve 
keep nuts1_name nuts3_name nuts3
reclink2 nuts1_name nuts3_name using `file_2015', idmaster(nuts3) /// 
	idusing(fake_id) required(nuts1_name) gen(_match)
bys fake_id : egen max_match = max(_match)
keep if _match == max_match 
list nuts3_name nuts3 if fake_id ==. 
keep fake_id nuts3 
keep if fake_id !=.
tempfile cw
save `cw'

use `file_2015', clear
merge 1:1 fake_id using `cw', keep(1 3) nogen
replace nuts3 = 5358 if nuts3_name == "Dren"
replace nuts3 = 9464 if nuts3_name == "Hof, Stadt"
replace nuts3 = 9564 if nuts3_name == "Nrnberg, Stadt"
replace nuts3 = 9563 if nuts3_name == "Frth, Stadt"
replace nuts3 = 9573 if nuts3_name == "Frth"
replace nuts3 = 1057 if nuts3_name == "Pln"
replace nuts3 = 8421 if nuts3_name == "Ulm"
replace nuts3 = 15083 if nuts3_name == "Brde"
replace nuts3 = 5334 if strpos(nuts3_name, "Aachen")!=0
collapse (rawsum) num_* (first) *name,  by(nuts3)
drop if nuts3 ==.

tempfile file_2015_cleaned_up 
save `file_2015_cleaned_up'
restore

merge 1:1 nuts3 using `file_2015_cleaned_up', nogen keep(1 3)

* create statistics for all years combined
foreach var in num_courses_compl num_courses_start num_new_eligible ///
	num_new_partic { 
		egen `var'_15_19 = rowtotal(`var'_*)
	}
egen num_graduates_15_16 = rowtotal(num_graduates*)
egen num_left_17_19 = rowtotal(num_left*)
gen num_finished_15_19 = num_graduates_15_16 + num_left_17_19

* clean up
order nuts1_name nuts3_name nuts3 *_15_19 *2015 *2016 *2017 *2018 *2019, first
ren nuts3 ags

* save
save "${data_derived}/integration_by_nuts3_2015_to_2019.dta", replace
export delimited "${data_derived}/integration_by_nuts3_2015_to_2019.csv", replace

